1 Z czym musieliƛmy się zmierzyć

Drzewo danych

Drzewo danych

2 Ujednolicanie zmiennych

Drzewo danych

Drzewo danych

Drzewo danych

Drzewo danych

previous <- read.csv(file = "../previous_credits_data.csv")
current <- read.csv(file = "../data/parted/data_train.csv")

2.1 Fragmenty naszych ramek

read.csv("../data/bureau_balance.csv", nrow = 5)
read.csv("../data/previous_application.csv", nrow = 5)
read.csv("../data/credit_card_balance.csv", nrow = 5)

3 Przetwarzanie

installments %>% select(SK_ID_PREV, SK_ID_CURR, DAYS_INSTALMENT,
                        DAYS_ENTRY_PAYMENT, AMT_INSTALMENT, AMT_PAYMENT) %>%
                mutate(DAYS_DELAY = DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT,
                      AMT_UNDERPAYMENT = AMT_INSTALMENT - AMT_PAYMENT) %>% 
  mutate(STATUS = if_else(is.na(DAYS_DELAY), "X",
                  if_else(DAYS_DELAY <= 0, "0",
                  if_else(DAYS_DELAY <= 30, "1",
                  if_else(DAYS_DELAY <= 60, "2",
                  if_else(DAYS_DELAY <= 90, "3",
                  if_else(DAYS_DELAY <= 120, "4", "5"))))))) %>% 
  mutate(STATUS_X = if_else(STATUS == "X", 1, 0),
         STATUS_0 = if_else(STATUS == "0", 1, 0),
         STATUS_1 = if_else(STATUS == "1", 1, 0),
         STATUS_2 = if_else(STATUS == "2", 1, 0),
         STATUS_3 = if_else(STATUS == "3", 1, 0),
         STATUS_4 = if_else(STATUS == "4", 1, 0),
         STATUS_5 = if_else(STATUS == "5", 1, 0)) %>%
  select(SK_ID_PREV, SK_ID_CURR, STATUS_X, STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5, DAYS_INSTALMENT) %>%
  group_by(SK_ID_CURR, SK_ID_PREV) %>%
  summarise(STATUS_X = sum(STATUS_X), STATUS_0 = sum(STATUS_0), STATUS_1 = sum(STATUS_1), STATUS_2 = sum(STATUS_2), 
          STATUS_3 = sum(STATUS_3), STATUS_4 = sum(STATUS_4), STATUS_5 = sum(STATUS_5), IS_ACTIVE = max(DAYS_INSTALMENT) > -31) %>%
  mutate(SUM_LATE_MONTHS = STATUS_1 + STATUS_2 * 2 + STATUS_3 * 3 + STATUS_4 * 4 + STATUS_5 * 5) %>%
  mutate(SUM_MONTHS = STATUS_1 + STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5 + STATUS_0 + STATUS_X) %>%
  mutate(RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS) -> inst_mod

JOINED_prev_inst <- left_join(previous_apps, inst_mod, by = c("SK_ID_PREV", "SK_ID_CURR"))

credit_cards %>% select(SK_ID_PREV, SK_ID_CURR, AMT_PAYMENT_TOTAL_CURRENT, AMT_INST_MIN_REGULARITY, MONTHS_BALANCE) %>%
  mutate(IS_NOT_PAID = if_else(AMT_PAYMENT_TOTAL_CURRENT - AMT_INST_MIN_REGULARITY < 0, 1, 0)) %>%
  group_by(SK_ID_PREV, SK_ID_CURR) %>%
  summarise(SUM_LATE_MONTHS = sum(IS_NOT_PAID), 
            SUM_MONTHS = n(), 
            RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS,
            IS_ACTIVE = max(MONTHS_BALANCE) >= -1) -> cards_mod

left_join(JOINED_prev_inst, cards_mod, by = c("SK_ID_PREV", "SK_ID_CURR"))%>%
  mutate(SUM_LATE_MONTHS = 
           if_else(is.na(SUM_LATE_MONTHS.y), if_else(is.na(SUM_LATE_MONTHS.x), 0, SUM_LATE_MONTHS.x), 
                   if_else(is.na(SUM_LATE_MONTHS.x), SUM_LATE_MONTHS.y, SUM_LATE_MONTHS.x + SUM_LATE_MONTHS.y))) %>%
  mutate(SUM_MONTHS = 
           if_else(is.na(SUM_MONTHS.y), if_else(is.na(SUM_MONTHS.x), 0, SUM_MONTHS.x), 
                   as.double(if_else(is.na(SUM_MONTHS.x), SUM_MONTHS.y, as.integer(SUM_MONTHS.x + SUM_MONTHS.y))))) %>%
  mutate(SUM_MONTHS = if_else(SUM_MONTHS < SUM_LATE_MONTHS, SUM_LATE_MONTHS, SUM_MONTHS)) %>%
  mutate(RATIO_LATE_MONTHS = if_else(is.na(SUM_MONTHS),0,SUM_LATE_MONTHS/SUM_MONTHS)) %>%
  mutate(STATUS_0 = SUM_MONTHS - SUM_LATE_MONTHS) %>%
  mutate(STATUS_1 = if_else(!is.na(SUM_LATE_MONTHS.y) & !is.na(SUM_MONTHS.y),
                            if_else(is.na(STATUS_1), SUM_LATE_MONTHS.y, STATUS_1 + SUM_LATE_MONTHS.y), 
                            if_else(is.na(STATUS_1), 0, STATUS_1))) %>%
  mutate(STATUS_X = if_else(is.na(SUM_LATE_MONTHS.y) & !is.na(SUM_MONTHS.y), 
                            if_else(is.na(STATUS_X), SUM_MONTHS.y, as.integer(STATUS_X + SUM_MONTHS.y)), as.integer(STATUS_X))) %>%
  mutate(IS_ACTIVE = if_else(!is.na(IS_ACTIVE.y) & !is.na(IS_ACTIVE.x), IS_ACTIVE.x | IS_ACTIVE.y,
                             if_else(is.na(IS_ACTIVE.y), IS_ACTIVE.x, IS_ACTIVE.y))) %>%
  mutate(UNKNOWN_STATE = if_else(is.na(IS_ACTIVE), 1, 0)) %>%
  select(-SUM_MONTHS.x, -SUM_MONTHS.y, -SUM_LATE_MONTHS.x, -SUM_LATE_MONTHS.y, -RATIO_LATE_MONTHS.x, -RATIO_LATE_MONTHS.y,
         -IS_ACTIVE.x, -IS_ACTIVE.y) %>% 
  select(SK_ID_CURR, SK_ID_PREV, NAME_CONTRACT_TYPE, STATUS_X,
                  STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5,
                  SUM_MONTHS, SUM_LATE_MONTHS, RATIO_LATE_MONTHS, IS_ACTIVE, UNKNOWN_STATE,
                  NAME_CONTRACT_TYPE) %>%
  filter(!is.na(SUM_MONTHS)) %>%
  mutate(STATUS_X = if_else(is.na(STATUS_X), SUM_MONTHS-STATUS_1-STATUS_0,as.double(STATUS_X)),
         STATUS_2 = if_else(is.na(STATUS_2), 0, as.double(STATUS_2)),
         STATUS_3 = if_else(is.na(STATUS_3), 0, as.double(STATUS_3)),
         STATUS_4 = if_else(is.na(STATUS_4), 0, as.double(STATUS_4)),
         STATUS_5 = if_else(is.na(STATUS_5), 0, as.double(STATUS_5))) %>% 
  mutate(CASH_LOANS = if_else(NAME_CONTRACT_TYPE == "Cash loans", 1, 0),
         CONSUMER_LOANS = if_else(NAME_CONTRACT_TYPE == "Consumer loans", 1, 0),
         REVOLVING_LOANS = if_else(NAME_CONTRACT_TYPE == "Revolving loans", 1, 0),
         UNKNOWN_TYPE_LOANS = if_else(NAME_CONTRACT_TYPE == "XNA", 1, 0)) %>%
  group_by(SK_ID_CURR) %>%
  summarise(STATUS_0 = sum(STATUS_0), STATUS_X = sum(STATUS_X), STATUS_1 = sum(STATUS_1),
            STATUS_2 = sum(STATUS_2), STATUS_3 = sum(STATUS_3), STATUS_4 = sum(STATUS_4), STATUS_5 = sum(STATUS_5),
            SUM_MONTHS = sum(SUM_MONTHS), SUM_LATE_MONTHS = sum(SUM_LATE_MONTHS), RATIO_LATE_MONTHS = SUM_LATE_MONTHS/SUM_MONTHS,
            CASH_LOANS = sum(CASH_LOANS), CONSUMER_LOANS = sum(CONSUMER_LOANS), REVOLVING_LOANS = sum(REVOLVING_LOANS), 
            UNKNOWN_TYPE_LOANS = sum(UNKNOWN_TYPE_LOANS), ACTIVE = sum(IS_ACTIVE, na.rm = TRUE), CLOSED = sum(!IS_ACTIVE, na.rm = TRUE), SOLD = 0, BAD_DEBT = 0, UNKNOWN_STATE = 0) %>%
  bind_rows(read.csv("previous_appliactions/grouped_data_plus_active.csv"))%>% 
  group_by(SK_ID_CURR) %>%
  summarise(STATUS_0 = sum(STATUS_0, na.rm = TRUE),
            STATUS_1 = sum(STATUS_1, na.rm = TRUE),
            STATUS_2 = sum(STATUS_2, na.rm = TRUE),
            STATUS_3 = sum(STATUS_3, na.rm = TRUE),
            STATUS_4 = sum(STATUS_4, na.rm = TRUE),
            STATUS_5 = sum(STATUS_5, na.rm = TRUE),
            STATUS_X = sum(STATUS_X, na.rm = TRUE),
            SUM_MONTHS = sum(SUM_MONTHS, na.rm = TRUE),
            SUM_LATE_MONTHS = sum(SUM_LATE_MONTHS, na.rm = TRUE),
            RATIO_LATE_MONTHS = if_else(is.na(SUM_MONTHS), 0, SUM_LATE_MONTHS/SUM_MONTHS),
            CASH_LOANS = sum(CASH_LOANS, na.rm = TRUE),
            CONSUMER_LOANS = sum(CONSUMER_LOANS, na.rm = TRUE),
            REVOLVING_LOANS = sum(REVOLVING_LOANS, na.rm = TRUE),
            UNKNOWN_TYPE_LOANS = sum(UNKNOWN_TYPE_LOANS, na.rm = TRUE),
            ACTIVE = sum(ACTIVE, na.rm = TRUE),
            CLOSED = sum(CLOSED, na.rm = TRUE),
            SOLD = sum(SOLD, na.rm = TRUE),
            BAD_DEBT = sum(BAD_DEBT, na.rm = TRUE),
            UNKNOWN_STATE = sum(UNKNOWN_STATE, na.rm = TRUE)) -> previous_credits_data

3.1 PoƂączone dane

Odsetek klientĂłw, ktĂłrzy brali wczeƛniej jakąƛ poĆŒyczkę:

current <- left_join(current, previous, by = "SK_ID_CURR")
sum(!is.na(current$SUM_MONTHS))/nrow(current)
## [1] 0.9917347

4 GƂówna ramka danych

GƂówną ramką danych jest application_train. Przedstawia ona wszystkie aplikacje. PoniĆŒsze wykresy zostaƂy wykonane na treningowym podzbiorze.

4.1 Target

Ramka zawiera kolumnę TARGET z wartoƛciami 0 i 1. 1 oznacza klientów, którzy mieli problemy z pƂatnoƛciami, a 0 pozostaƂych.

Ponad 90% osób nie mam problemów z pƂatnoƛciami.

4.2 ZaleĆŒnoƛci między danymi

4.2.1 Ranking regionu w ktĂłrym mieszka klient

Biorąc pod uwagę miasto

Biorąc pod uwagę miasto

Nie biorąc pod uwagę miasta

Nie biorąc pod uwagę miasta

MoĆŒna zauwaĆŒyć znaczącą rĂłĆŒnicę między osobami z rankingiem 1 i 3.

4.2.2 Czy osoba nie pracuje w mieƛcie, w którym jest zameldowana

Osoby nie pracujące w mieƛcie, w którym są zameldowane częƛciej mają problemy ze spƂatą kredytu.

4.2.3 Czy adres zameldowania nie pokrywa się z adresem kontaktowym

4.2.4 BranĆŒa, w ktĂłrej pracuje osoba aplikująca

Osoby będące pracownikami fizycznymi najczęƛciej mają problemy z terminowymi pƂatnoƛciami. Inne zagroĆŒone grupy to kierowcy, ochroniarze i ludzie pracujący w gastronomii.

4.2.5 Typ zamieszkania

Osoby mieszkające z rodzicami lub wynajmujące mieszkanie częƛciej mają problemy ze spƂatą zobowiązaƄ.

4.2.6 Rodzaj kredytu

Osoby biorące kredyt gotĂłwkowy mają problemy finansowe częƛciej, niĆŒ osoby biorące kredyt odnawialny. ### Ile osĂłb z otoczenia spoƂecznego osoby aplikującej spĂłĆșniƂo się ze spƂatą 60 dni

Osoby mające w otoczeniu spoƂecznym osoby, które zalegają ze spƂatą, same częƛciej mają problemy finansowe

4.2.7 Dostarczone dokumenty

Klienci, którzy dostarczyli dokument 3, częsciej mieli problemy ze spƂatą kredytu, tak samo jak klienci, którzy nie dostarczyli dokumentu 6.

4.2.8 ZaleĆŒnoƛci zmiennych liczbowych

  • EXT_SOURCE_2, EXT_SOURCE_3 - znormalizowane zewnętzne ĆșrĂłdƂa danych
  • DAYS_ID_PUBLISH - ile dni przed aplikacją klient zmieniƂ dokument toĆŒsamoƛci
  • DAYS_BIRTH - wiek klienta w dniach
  • DAYS_LAST_PHONE_CHANGE - ile dni przed aplikacją klient zmieniƂ telefon

Największą korelację ze zmienną TARGET mają zmienne EXT_SOURCE_2 i EXT_SOURCE_3. Na wykresach skrzynkowych moĆŒna zauwaĆŒyć, ĆŒe osoby mające problemy ze spƂatą kredytu mają niĆŒszą oceną w obu tych ĆșrĂłdƂach.

4.3 Dane o historycznych poĆŒyczkach

current <- current %>% filter(!is.na(SUM_MONTHS)) %>% select(TARGET, STATUS_0, STATUS_1, STATUS_2, STATUS_3, STATUS_4, STATUS_5, STATUS_X, SUM_MONTHS, SUM_LATE_MONTHS, RATIO_LATE_MONTHS, ACTIVE, CLOSED, BAD_DEBT, SOLD, UNKNOWN_STATE, SUM_MONTHS, CONSUMER_LOANS, REVOLVING_LOANS, UNKNOWN_TYPE_LOANS)

current$TARGET <- as.factor(current$TARGET)

ggplot(data = current, aes(x = log10(STATUS_0), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("STATUS_0 value (log)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = current, aes(x = log10(STATUS_1), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("STATUS_1 value (log)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = current, aes(x = log10(STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("STATUS_>1 value (log)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = current, aes(x = log10(SUM_MONTHS), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("SUM_MONTHS value (log10)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = current, aes(x = log10(ACTIVE), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("ACTIVE value (log10)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = current, aes(x = log10(ACTIVE), fill = TARGET)) +
  geom_histogram(position = position_identity(), alpha = 0.8) +
  xlab("CLOSED value (log10)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

current %>% group_by(TARGET) %>% summarise(STATUS_0 = mean(STATUS_0), STATUS_1 = mean(STATUS_1), STATUS_m1 = mean(STATUS_2 + STATUS_3 + STATUS_4 + STATUS_5), ACTIVE = mean(ACTIVE), CLOSED = mean(CLOSED))
current$TARGET <- as.numeric(levels(current$TARGET))[as.integer(current$TARGET)] 

visdat::vis_cor(current)